# Libraries
library(tidyverse)
library(here)
library(lubridate)
library(meta)
library(knitr)
library(rsq)
library(car)
library(vroom)
# Functions
#source(here("Script/Functions", ""))
We first ran a genome-wide association study in the UK Biobank European cohort with gout as the outcome. This was done with a total of 27,287,012 variants (after imputation), and adjusted for age, sex, and the first 40 principal components. Below, we read in the summary statistics. We did some initial filtering to remove X and Y chromosome SNPs, and most indels. We also removed variants that were not in the imputed CoreExome genotype data. We then filtered out multi-allelic variants and variants with MAF < 0.01. For the Polynesian analysis, we additionally filtered out variants that were not directly genotyped in the CoreExome. Finally, we filter out all SNPs with P-values greater than 5e-8.
# Reading in summary stats
# Two-level cache: if the fully filtered sumstat5 exists nothing is rebuilt;
# otherwise the ID-parsed sumstat3 cache is loaded (or rebuilt from the raw
# GWAS output), matched against the imputed CoreExome variants, and
# MAF/INFO-filtered to produce sumstat5.
# NOTE(review): the load() of the sumstat5 cache is commented out, so when the
# cache file exists this branch does nothing and downstream sections rely on
# sumstat5 already being in the environment -- confirm this is intended.
if(file.exists(here("Output/Temp/sumstat5.RData"))) {
#load(here("Output/Temp/sumstat5.RData"))
} else {
if(file.exists(here("Output/Temp/sumstat3.RData"))) {
load(here("Output/Temp/sumstat3.RData"))
} else {
sumstat <- vroom(here("Data/GWAS/ukbb_gout-allcontrol_chr1-22.X.XY.add_unfiltered_p.tsv"), delim = "\t", col_names = TRUE) # start with 27,287,012 variants in the summary stats
sumstat2 <- sumstat %>%
filter(str_length(A1) == 1,
CHR %in% 1:22) # Removing 738,219 indels and keeping only autosomes (removes a further 1,069,966 variants) = 25,478,827 variants
# Removed unused sumstat2_1 (same split as sumstat2_2 below but with n = 2);
# it was never referenced again and doubled an expensive pass over ~25M rows:
# sumstat2_1 <- sumstat2 %>%
# mutate(SNP1 = sumstat2 %>% pull(SNP) %>% str_split(",", n = 2, simplify = TRUE) %>% .[,1],
# SNP2 = sumstat2 %>% pull(SNP) %>% str_split(",", n = 2, simplify = TRUE) %>% .[,2]) # this takes forever to run
# test <- sumstat2_1 %>%
# filter(str_detect(SNP2, ",")) %>%
# separate(SNP2, into = c("SNP2", "extra"), sep = ",") # 1,696 variants had an extra comma in their ID
#
# test1 <- test %>%
# mutate(extra = as.numeric(extra)) %>%
# filter(extra != CHR) # All just had their chromosome attached, can just remove the extra bit
# Split the compound SNP identifier ("<id1>,<id2>") into its two parts
sumstat2_2 <- sumstat2 %>%
mutate(SNP1 = sumstat2 %>% pull(SNP) %>% str_split(",", simplify = TRUE) %>% .[,1],
SNP2 = sumstat2 %>% pull(SNP) %>% str_split(",", simplify = TRUE) %>% .[,2])
# test <- sumstat2_2 %>% filter(str_detect(SNP2, ",")) # none with ,
# Branch 1: variants whose first ID is an rsID
test1 <- sumstat2_2 %>%
filter(str_detect(SNP1, regex("^rs[0-9]+"))) # 24,248,522 variants have an rsID in SNP1 column
# test1_1 <- test1 %>%
# filter(str_detect(SNP2, regex("^rs[0-9]+"))) # 633,645 of these have an rsID in SNP2 column
#
# test1_1_1 <- test1_1 %>%
# filter(SNP1 != SNP2) # 1,367 variants have two rsIDs, for the most part SNP1 appears to be the newest rsID, but I might want to keep the extra rsID in a separate column
test1_2 <- test1 %>%
filter(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 23,610,697 variants have rsID in SNP1 column and chr:bp_a1_a2 in SNP2 column
# Parse the chr:bp_a1_a2 ID into CHR/BP/allele columns (CHR/BP checked against
# the existing columns in the commented code below)
test1_2_1 <- test1_2 %>%
mutate(CHR2 = test1_2 %>% pull(SNP2) %>% str_split(":", simplify = TRUE) %>% .[,1] %>% as.numeric(),
BP2 = test1_2 %>% pull(SNP2) %>% str_split(":", simplify = TRUE) %>% .[,2] %>% str_split("_", simplify = TRUE) %>% .[,1] %>% as.numeric(),
Allele1 = test1_2 %>% pull(SNP2) %>% str_split("_", simplify = TRUE) %>% .[,2],
Allele2 = test1_2 %>% pull(SNP2) %>% str_split("_", simplify = TRUE) %>% .[,3])
# tmp <- test1_2_1 %>%
# filter(BP != BP2) # all BPs are equal
#
# tmp <- test1_2_1 %>%
# filter(CHR != CHR2) # all CHRs are equal
#
# tmp <- test1_2_1 %>%
# filter(A1 != Allele2) # Allele2 is not always A1
test1_2_2 <- test1_2_1 %>%
select(-CHR2, -BP2) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1) # removes a further 219,771 indels = 23,390,926 SNPs
test1_2_final <- test1_2_2 %>%
select(SNP, Allele1, Allele2)
# test1_3 <- test1 %>%
# filter(!str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$|^rs[0-9]+$"))) # 4,180 variants have neither rsID nor chr:bp_a1_a2 in SNP2 column
#
# test1_4 <- test1_3 %>%
# filter(!str_detect(SNP2, regex("^Affx-[0-9]+$"))) # All of these are in the format Affx-<number>
# Assemble the tidy identifier columns (RSID / alternate rsID / Affymetrix ID /
# positional SNP_ID) and attach the parsed alleles
test1_final <- test1 %>%
mutate(RSID = SNP1,
ALT_RSID = case_when(str_detect(SNP2, regex("^rs[0-9]+$")) & SNP1 != SNP2 ~ SNP2, TRUE ~ NA_character_),
AFFYID = case_when(str_detect(SNP2, regex("^Affx-[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
SNP_ID = case_when(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")) ~ SNP2, TRUE ~ NA_character_)) %>%
left_join(test1_2_final, by = "SNP")
# Keep rows without a positional ID, or with a positional ID whose alleles
# survived the indel filter above
test1_final2 <- test1_final %>%
filter(is.na(SNP_ID) | (!is.na(SNP_ID) & !is.na(Allele1)))
# nrow(test1_final2) - nrow(test1_final) # successfully removed indels from final dataset
# sum(!is.na(test1_final2$RSID))
# sum(!is.na(test1_final2$ALT_RSID))
# sum(!is.na(test1_final2$AFFYID))
# sum(!is.na(test1_final2$SNP_ID))
# sum(!is.na(test1_final2$Allele1))
# sum(!is.na(test1_final2$Allele2))
# Branch 2: variants without an rsID in SNP1 (positional or Affymetrix IDs)
test2 <- sumstat2_2 %>%
filter(!str_detect(SNP1, regex("^rs[0-9]+"))) # 1,230,305 variants don't have an rsID in SNP1 column
test2_1 <- test2 %>%
filter(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 1,230,053 have the SNP_ID format in SNP1
test2_1_1 <- test2_1 %>%
mutate(CHR2 = test2_1 %>% pull(SNP1) %>% str_split(":", simplify = TRUE) %>% .[,1] %>% as.numeric(),
BP2 = test2_1 %>% pull(SNP1) %>% str_split(":", simplify = TRUE) %>% .[,2] %>% str_split("_", simplify = TRUE) %>% .[,1] %>% as.numeric(),
Allele1 = test2_1 %>% pull(SNP1) %>% str_split("_", simplify = TRUE) %>% .[,2],
Allele2 = test2_1 %>% pull(SNP1) %>% str_split("_", simplify = TRUE) %>% .[,3])
# tmp <- test2_1_1 %>%
# filter(BP != BP2) # all BPs are equal
#
# tmp <- test2_1_1 %>%
# filter(CHR != CHR2) # all CHRs are equal
test2_1_2 <- test2_1_1 %>%
select(-CHR2, -BP2) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1)
# nrow(test2_1_2) - nrow(test2_1_1) # removed 975,583
test2_1_final <- test2_1_2 %>%
select(SNP, Allele1, Allele2)
# test2_1_1 <- test2_1 %>%
# filter(str_detect(SNP2, regex("^rs[0-9]+"))) # 23 of these have RSID in SNP2
# test2_1_2 <- test2_1 %>%
# filter(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 1,230,029 of these have SNP_ID in SNP2
#
# test2_1_2_1 <- test2_1_2 %>%
# filter(SNP1 != SNP2) # All SNP_ID columns identical in these individuals
# test2_1_3 <- test2_1 %>%
# filter(str_detect(SNP2, regex("^Affx-[0-9]+$"))) # 1 has AffyID in SNP2
# test2_2 <- test2 %>%
# filter(str_detect(SNP1, regex("^Affx-[0-9]+$"))) # 252 have an AffyID in SNP1
#
# test2_2_1 <- test2_2 %>%
# filter(str_detect(SNP2, regex("^Affx-[0-9]+$"))) # All have AffyID in SNP2
#
# test2_2_1_1 <- test2_2_1 %>%
# filter(SNP1 != SNP2) # All AffyID columns identical in these individuals
test2_final <- test2 %>%
mutate(RSID = case_when(str_detect(SNP2, regex("^rs[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
ALT_RSID = NA_character_,
AFFYID = case_when(str_detect(SNP2, regex("^Affx-[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
SNP_ID = case_when(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")) ~ SNP1, TRUE ~ NA_character_)) %>%
left_join(test2_1_final, by = "SNP")
test2_final2 <- test2_final %>%
filter(is.na(SNP_ID) | (!is.na(SNP_ID) & !is.na(Allele1)))
# Recombine the two branches and cache
sumstat3 <- rbind(test1_final2, test2_final2) %>% arrange(CHR, BP)
save(sumstat3, file = here("Output/Temp/sumstat3.RData"))
}
# tmp <- sumstat3 %>%
# select(CHR, BP) %>%
# unique()
# for(i in 1:22) {
# write_delim(select(filter(tmp, CHR == i), BP), file = paste0(here("Output/Temp/"), "chr", i, "_snplist.txt"), delim = "\n", col_names = F)
# }
# Now we will work out which of these SNPs are present in the Imputed CoreEx data
#system(paste0('source ~/.bashrc; parallel "zcat /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Imputed_Genotypes/QC1-10_Impute_EUR_only/CZ-MB1.2-QC1.10_EUR_imputed_chr{}.vcf.gz | grep -v ', "'#'",' | cut -f2 | grep -Fwf ', here("Output/Temp/"),'chr{}_snplist.txt > ', here("Output/Temp/"),'matched_snps_chr{}.txt" ::: {1..22}'))
# Read the per-chromosome lists of positions that matched the imputed
# CoreExome VCFs (written by the commented parallel/grep call above);
# map_dfr replaces the old grow-by-rbind + assign/get loop
out <- map_dfr(1:22, function(i) {
  read_delim(here("Output/Temp", paste0("matched_snps_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>%
    mutate(CHR = i)
})
out <- out %>%
select(CHR, X1) %>%
mutate(C_B = paste0(CHR, "_", X1))
# Keep only summary-stat variants present in the imputed CoreExome data
sumstat4 <- sumstat3 %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(C_B %in% out$C_B)
rm(sumstat3)
# Now to pull out the MAF for all SNPs
# Already have CHR and BP for all SNPs separated by chr in matched_snps_chr* and mfi files are per chr so can simply match location in grep
#system(paste0('source ~/.bashrc; parallel "grep -Fwhf ', here("Output/Temp/"), 'matched_snps_chr{}.txt /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/splits/ukb_mfi_chr{}_v3.txt > ', here("Output/Temp/"), 'ukb_maf_info_chr{}.txt" ::: {1..22}'))
# Read the per-chromosome UKB mfi extracts (MAF/INFO per variant)
out <- map_dfr(1:22, function(i) {
  read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>%
    mutate(CHR = i)
})
colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
# test <- out %>%
# filter(is.na(Allele1) | is.na(Allele2))
# Restrict the mfi records to single-base alleles, MAF in (0.01, 0.99), and
# imputation INFO > 0.3
out <- out %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1,
MAF > 0.01,
MAF < 0.99,
INFO > 0.3)
out2 <- out %>%
filter(C_B %in% sumstat4$C_B) %>%
arrange(CHR, BP)
# Keep only summary-stat variants that passed the MAF/INFO filter and cache
sumstat5 <- sumstat4 %>%
semi_join(out2, by = c("CHR", "BP"))
save(sumstat5, file = here("Output/Temp", "sumstat5.RData"))
}
# Pulling out biallelic SNPs only and adding MAF and INFO columns, also making sure MAF etc are correct for allele1/2
# Steps: (1) drop chr/bp positions that appear more than once in sumstat5
# (multi-allelic sites are handled in a later section); (2) re-read the UKB
# mfi files and join MAF/INFO on CHR+BP; (3) drop sites whose alleles disagree
# between the two sources; (4) flip OR/CI where the effect allele is not the
# minor allele and rename MAF to EAF; (5) restrict to variants present in the
# directly genotyped CoreExome data for the Polynesian analysis.
# NOTE(review): both load() calls are commented out, so when the caches exist
# this branch does nothing and downstream code relies on the objects already
# being in the environment -- confirm this is intended.
if(file.exists(here("Output/Temp/biallelic_sumstat_final.RData"))) {
# load(here("Output/Temp/biallelic_sumstat_final.RData"))
# load(here("Output/Temp/biallelic_sumstat_final_poly.RData"))
} else {
# Positions occurring more than once in sumstat5 are multi-allelic; keep only
# positions that occur exactly once
tmp <- sumstat5 %>%
filter(duplicated(C_B))
sumstat5_biallelic <- sumstat5 %>%
filter(!(C_B %in% tmp$C_B))
# Re-read the per-chromosome UKB mfi extracts (MAF/INFO per variant)
out <- tibble()
for(i in 1:22) {
assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
out <- rbind(out, get(paste0("chr", i, "_snps")))
rm(list = paste0("chr", i, "_snps"), i)
}
colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
out <- out %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1,
MAF > 0.01,
MAF < 0.99,
INFO > 0.3)
out2 <- out %>%
filter(C_B %in% sumstat5_biallelic$C_B) %>%
arrange(CHR, BP)
# Drop positions duplicated on the mfi side too, then join MAF/INFO on
# position; the semi_join removes summary-stat rows with no mfi match
tmp <- out2 %>%
filter(duplicated(C_B))
out2 <- out2 %>%
filter(!(C_B %in% tmp$C_B))
sumstat5_biallelic <- sumstat5_biallelic %>%
left_join(out2, by = c("CHR", "BP"))
sumstat5_biallelic <- sumstat5_biallelic %>%
semi_join(out2, by = c("CHR", "BP"))
# test <- sumstat5_biallelic %>%
# filter(str_length(Allele1.x) == 1,
# str_length(Allele1.y) == 1)
#
# test2 <- sumstat5_biallelic %>%
# filter(!(C_B.x %in% test$C_B.x))
#
# test3 <- test2 %>%
# filter(!is.na(Allele1.x)) # This just removed all of the SNPs without SNP_ID column
# sum(sumstat5_biallelic$SNP1.x != sumstat5_biallelic$SNP2.y) # only 5 SNPs don't match by name
test <- sumstat5_biallelic %>%
filter(sumstat5_biallelic$SNP1.x != sumstat5_biallelic$SNP2.y) # 4 are multiallelic, one is an indel (looked it up in dbSNP)
# NOTE(review): extra_multiallelic1 is never used again below -- confirm
# whether it can be removed or should feed into a filter
extra_multiallelic1 <- test %>%
slice(-2)
#sum(sumstat5_biallelic$Allele2.x != sumstat5_biallelic$Allele2.y, na.rm = T) # 878 don't match
test <- sumstat5_biallelic %>%
filter(sumstat5_biallelic$Allele2.x != sumstat5_biallelic$Allele2.y) # these are all mismatched alleles (i.e. multi-allellic)
# Sites where exactly one of the two alleles disagrees between sources are
# treated as hidden multi-allelics and excluded below
extra_multiallelic2 <- sumstat5_biallelic %>%
filter((Allele1.x == Allele1.y & Allele2.x != Allele2.y) | (Allele1.x != Allele1.y & Allele2.x == Allele2.y))
# Rows without a parsed SNP_ID (no .x alleles): take alleles from the mfi side
sumstat5_biallelic1 <- sumstat5_biallelic %>%
filter(is.na(Allele1.x),
!(SNP %in% extra_multiallelic2$SNP)) %>%
rename(Allele1 = Allele1.y,
Allele2 = Allele2.y,
Effect_Allele = A1) %>%
select(-Allele1.x, -Allele2.x)
# Rows where both allele pairs agree: keep the summary-stat alleles
sumstat5_biallelic2 <- sumstat5_biallelic %>%
filter(Allele1.x == Allele1.y,
Allele2.x == Allele2.y,
!(SNP %in% extra_multiallelic2$SNP)) %>%
rename(Allele1 = Allele1.x,
Allele2 = Allele2.x,
Effect_Allele = A1) %>%
select(-Allele1.y, -Allele2.y)
# test <- sumstat5_biallelic %>%
# filter(!(SNP %in% sumstat5_biallelic1$SNP) & !(SNP %in% sumstat5_biallelic2$SNP))
# Reorder columns so the two subsets can be rbind-ed back together
sumstat5_biallelic3 <- sumstat5_biallelic2 %>%
select(CHR:SNP_ID, C_B.x:SNP2.y, Allele1, Allele2, MAF:C_B.y) %>%
rbind(sumstat5_biallelic1) %>%
arrange(CHR, BP)
test <- sumstat5_biallelic3 %>%
filter(Minor_Allele != Effect_Allele)
#summary(test$MAF) # all really close to 0.5 MAF, just need to flip OR, L95, and U95 then set Effect_Allele to Minor_Allele column
# Flip the effect direction so the minor allele is the effect allele: invert
# the OR and swap the inverted confidence bounds (1/U95 becomes the new L95)
test <- test %>%
mutate(OR = 1/OR,
tmp = 1/L95,
tmp2 = 1/U95,
L95 = tmp2,
U95 = tmp,
Effect_Allele = Minor_Allele) %>%
rename(EAF = MAF) %>%
select(-tmp, -tmp2, -Minor_Allele)
sumstat5_biallelic4 <- sumstat5_biallelic3 %>%
filter(Minor_Allele == Effect_Allele) %>%
select(-Minor_Allele) %>%
rename(EAF = MAF) %>%
rbind(test) %>%
arrange(CHR, BP)
# Label the non-effect allele as Alternate_Allele, whichever of Allele1/2 it is
test <- sumstat5_biallelic4 %>%
filter(Allele2 == Effect_Allele) %>%
rename(Alternate_Allele = Allele1) %>%
select(CHR:Effect_Allele, Alternate_Allele, TEST:SNP2.y, EAF:C_B.y)
test2 <- sumstat5_biallelic4 %>%
filter(Allele1 == Effect_Allele) %>%
rename(Alternate_Allele = Allele2) %>%
select(CHR:Effect_Allele, Alternate_Allele, TEST:SNP2.y, EAF:C_B.y)
sumstat5_biallelic5 <- rbind(test, test2) %>%
arrange(CHR, BP)
biallelic_sumstat_final <- sumstat5_biallelic5
# tmp <- biallelic_sumstat_final %>%
# filter(C_B.x != C_B.y)
#
# tmp <- biallelic_sumstat_final %>%
# filter(SNP1.x != SNP2.y) # Only non-matching site was an indel (can remove)
#
# tmp <- biallelic_sumstat_final %>%
# filter(SNP2.x != SNP1.y) # This includes the non-matching site from above and every other site is just wrong in the SNP1.y column (but they are the same SNP) => keep SNP2.x
biallelic_sumstat_final <- biallelic_sumstat_final %>%
filter(SNP1.x == SNP2.y) %>%
select(-C_B.x, -C_B.y, -SNP2.y, -SNP1.y) %>%
rename(SNP1 = SNP1.x,
SNP2 = SNP2.x)
# Removing variants that aren't in the directly genotyped CoreEx data (for Polynesian analysis)
# snplist_plink2 <- biallelic_sumstat_final %>%
# mutate(BP2 = BP) %>%
# select(CHR, BP, BP2, SNP)
#write_delim(snplist_plink2, delim = " ", file = here("Output/Temp/snplist_plink2.txt"), col_names = F)
#system(paste0("source ~/.bashrc; plink1.9b4.9 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --extract range ", here("Output/Temp/snplist_plink2.txt"), " --make-bed --out ", here("Output/Temp/inCoreExGeno")))
# NOTE(review): inCoreExGeno.bim is produced by the commented plink call
# above, and snplist_plink2 only exists if the commented block was run --
# the rm() below will warn otherwise.
geno <- read_delim(here("Output/Temp", "inCoreExGeno.bim"), delim = "\t", col_names = FALSE) %>%
mutate(CHR_BP = paste0(X1, "_", X4))
biallelic_sumstat_final_poly <- biallelic_sumstat_final %>%
mutate(CHR_BP = paste0(CHR, "_", BP)) %>%
filter(CHR_BP %in% geno$CHR_BP)
rm(geno, snplist_plink2)
save(biallelic_sumstat_final, file = here("Output/Temp", "biallelic_sumstat_final.RData"))
save(biallelic_sumstat_final_poly, file = here("Output/Temp", "biallelic_sumstat_final_poly.RData"))
}
# Dealing with multi-allelic SNPs (leave for later)
# Keeps the rows of sumstat5 whose chr/bp did not survive the biallelic
# pipeline above, joins the UKB MAF/INFO records on position, and caches them
# for a later multi-allelic analysis.
# NOTE(review): the load() is commented out, so when the cache exists this
# branch does nothing -- confirm this is intended.
if(file.exists(here("Output/Temp/multi_allelic_for_later.RData"))) {
#load(here("Output/Temp/multi_allelic_for_later.RData"))
} else {
tmp <- biallelic_sumstat_final %>%
mutate(C_B = paste0(CHR, "_", BP))
sumstat5_multi <- sumstat5 %>%
filter(!(C_B %in% tmp$C_B)) # 38,991 rows contain multi-allelic sites
# Re-read the per-chromosome UKB mfi extracts (MAF/INFO per variant)
out <- tibble()
for(i in 1:22) {
assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
out <- rbind(out, get(paste0("chr", i, "_snps")))
rm(list = paste0("chr", i, "_snps"), i)
}
colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
out <- out %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1,
MAF > 0.01,
MAF < 0.99,
INFO > 0.3)
out2 <- out %>%
filter(C_B %in% sumstat5_multi$C_B) %>%
arrange(CHR, BP) # 26,235 of the 38,991 chr/bp locations match
# Positional join; multi-allelic sites can fan out to multiple rows here
sumstat5_multi <- sumstat5_multi %>%
left_join(out2, by = c("CHR", "BP"))
# tmp <- sumstat5_multi %>%
# unique() # all unique
# tmp <- sumstat5_multi %>%
# filter(duplicated(C_B.x))
#
# test <- sumstat5_multi %>%
# filter(C_B.x %in% tmp$C_B.x) # 27,909 rows contain duplicated C_B.x
save(sumstat5_multi, file = here("Output/Temp", "multi_allelic_for_later.RData"))
}
# Defining one SNP per locus ---------------------------------------------------------------------------
# Keep only the genome-wide significant SNPs (P <= 5e-8), ordered most
# significant first; cached so reruns skip the filtering step.
if(!file.exists(here("Output/Temp/biallelic_signif.RData"))) {
  biallelic_signif <- biallelic_sumstat_final %>%
    filter(P <= 5e-8) %>%
    arrange(P)
  save(biallelic_signif, file = here("Output/Temp/biallelic_signif.RData"))
} else {
  load(here("Output/Temp/biallelic_signif.RData"))
}
# Grouping into loci +- 500 kb of top SNPs
# Greedy clumping: repeatedly take the most significant remaining SNP as a
# locus lead and discard every SNP within 500 kb of it on the same chromosome.
# biallelic_signif is sorted by P, so slice(1) is always the current top SNP.
# The distance test uses abs(BP - lead) <= 500000 instead of the original
# BP %in% (lead - 5e5):(lead + 5e5), which materialised a ~1M-element vector
# on every iteration; the two are equivalent for integer base positions.
gout_top <- biallelic_signif %>%
  slice(1)
gout2 <- biallelic_signif %>%
  filter(!(CHR == gout_top$CHR[1] & abs(BP - gout_top$BP[1]) <= 500000))
while(nrow(gout2) > 0) {
  # Next lead = most significant SNP not yet absorbed by an earlier locus;
  # it is prepended, so gout_top$...[1] always refers to the newest lead
  tmp <- gout2 %>%
    slice(1)
  gout_top <- rbind(tmp, gout_top)
  gout2 <- gout2 %>%
    filter(!(CHR == gout_top$CHR[1] & abs(BP - gout_top$BP[1]) <= 500000))
}
gout_top <- gout_top %>%
  arrange(CHR, BP)
# Finding regions of loci
# Walk the significant SNPs in position order; a "gap" occurs where the
# distance to the previous SNP is >= 500 kb or the chromosome changes. Each
# gap closes the previous region and opens a new one, so collecting row 1,
# the rows either side of every gap, and the last row yields alternating
# region start/end rows in `out`.
biallelic_signif <- biallelic_signif %>%
  arrange(CHR, BP)
n_sig <- nrow(biallelic_signif)
# Distance to the previous SNP within the same chromosome (NA for the first
# SNP and at chromosome changes); vectorised replacement for the old row loop
out <- c(NA, ifelse(biallelic_signif$CHR[-1] == biallelic_signif$CHR[-n_sig],
                    biallelic_signif$BP[-1] - biallelic_signif$BP[-n_sig],
                    NA))
tmp <- biallelic_signif %>%
  mutate(Diff = out,
         Diff2 = case_when(Diff < 500000 ~ Diff))  # NA when Diff is NA or >= 500 kb
# Rows 2..n where Diff2 is NA mark a gap; interleave (gap - 1, gap) pairs in
# ascending order, bracketed by the first and last row
breaks <- which(is.na(tmp$Diff2[-1])) + 1
out <- biallelic_signif %>%
  slice(c(1, as.vector(rbind(breaks - 1, breaks)), n_sig))
# Extracting regions
# Rows of `out` alternate locus start / locus end, so odd rows give each
# region's start position and even rows its end position.
bgen_ranges <- out %>% select(CHR, BP)
tmp1 <- bgen_ranges %>%
  slice(seq(1, nrow(bgen_ranges), by = 2)) %>%
  rename(BP1 = BP)
tmp2 <- bgen_ranges %>%
  slice(seq(2, nrow(bgen_ranges), by = 2)) %>%
  rename(CHR.x = CHR, BP2 = BP)
# Pad each region by 50 kb on either side
bgen_ranges <- tmp1 %>%
  cbind(tmp2) %>%
  mutate(BP1 = BP1 - 50000,
         BP2 = BP2 + 50000) %>%
  select(-CHR.x)
# Range strings use two-digit chromosome codes (e.g. "04:123-456") --
# presumably what bgenix expects; str_pad replaces the original
# CHR < 10 / CHR > 9 split-and-rbind
bgen_ranges <- bgen_ranges %>%
  mutate(BGEN = paste0(str_pad(CHR, 2, pad = "0"), ":", BP1, "-", BP2)) %>%
  arrange(CHR, BP1)
tmp <- bgen_ranges %>%
  select(BGEN)
# Only drop helper objects that actually exist, so this section does not warn
# if an upstream section changes
rm(list = intersect(c("tmp1", "tmp2", "out", "gout2", "i"), ls()))
#write_delim(tmp, file = here("Output/Temp", "bgen_range.txt"), delim = "\n", col_names = F)
# Extracting all SNPs from biallelic sumstat that fit within boundaries of loci
if(file.exists(here("Output/Temp/biallelic_loci.RData"))) {
load(here("Output/Temp/biallelic_loci.RData"))
} else {
# One filter pass per locus range, bound together at the end; replaces the
# original loop that grew a data frame with rbind() on every iteration
biallelic_loci <- map_dfr(seq_len(nrow(bgen_ranges)), function(i) {
  biallelic_sumstat_final %>%
    filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i]))
})
# Positional fallback identifier for SNPs without an rsID
biallelic_loci <- biallelic_loci %>%
  mutate(SNP_ID2 = paste0(CHR, "_", BP, "_", Alternate_Allele, "_", Effect_Allele))
# NOTE(review): this save() is commented out, so the cache checked above is
# never written by this script -- confirm whether caching is still wanted
#save(biallelic_loci, file = here("Output/Temp/biallelic_loci.RData"))
}
# SNP list in plink --extract range format (chr, start, end, id) covering
# every SNP at the significant loci
tmp <- biallelic_loci %>%
mutate(BP2 = BP) %>%
select(CHR, BP, BP2, SNP)
#write_delim(tmp, file = here("Output/Temp", "biallelic_loci_snps.txt"), delim = "\t", col_names = F)
# Pick the single most significant lead SNP inside each region, then attach
# the region boundary columns (BP1/BP2/BGEN) to it.
# NOTE(review): the cbind assumes every bgen_ranges row contains at least one
# gout_top SNP; a range with none would yield a zero-row slice and misalign
# the cbind -- confirm ranges and lead SNPs are one-to-one.
out <- c()
for(i in 1:nrow(bgen_ranges)){
tmp <- gout_top %>%
filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i])) %>%
arrange(P) %>%
slice(1)
out <- rbind(out, tmp)
}
gout_top <- out %>%
cbind(bgen_ranges %>% select(-CHR))
rm(tmp, bgen_ranges, out, i)
# Extracting all SNPs at loci from bgen files and converting to plink format -------------------------------
#system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "bgen_range.txt"), ' | bcftools reheader -h /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forclumping.vcf.gz" ::: ', paste(unique(gout_top$CHR), collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forclumping.vcf.gz --extract range ', here("Output/Temp/biallelic_loci_snps.txt"), ' --pheno ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --pheno-name plink_goutaff --update-sex ', here("Data/GWAS", "gout_gwas_keep_ids_w_sex.txt"), ' --geno 0.1 --maf 0.01 --hwe 0.000001 --make-bed --out ', here("Output/Temp/"), 'chr{}_tmp" ::: ', paste(unique(gout_top$CHR), collapse = " ")))
# Reading the bim files into R and converting their identifiers to just the rsid
# fixed() makes the "." match a literal dot (the original regex dot matched
# any character)
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), fixed("_tmp.bim"))]
for(i in file_names){
  assign(i, read_delim(paste0(here("Output/Temp/"), i), delim = "\t", col_names = F))
  # `by` is now a proper named argument; the original wrapped it in extra
  # parentheses, which assigned a stray global `by` and only worked because
  # the value landed in left_join's third (by) position
  assign(i, get(i) %>%
           left_join(biallelic_loci, by = c("X1" = "CHR", "X4" = "BP")) %>%
           mutate(SNP_clean = case_when(is.na(RSID) ~ SNP_ID2, TRUE ~ RSID)))
  # assign(paste0(i, "_notequal"), get(i) %>% filter(X2 != SNP)) # all identical
  assign(paste0("new_", i), get(i) %>% select(X1, SNP_clean, X3:X6))
  #write_delim(get(paste0("new_", i)), file = paste0(here("Output/Temp/"), i), delim = "\t", col_names = F)
}
rm(list = ls()[str_detect(ls(), fixed(".bim"))], i, file_names)
# file_names <- file_names %>% as_tibble() %>% separate(value, sep = "_", into = c("name", "rm")) %>% mutate(name = paste0(name, "_tmp")) %>% pull(name)
#
# for(i in file_names){
# #system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), i, ' --set-missing-var-ids @:# --make-bed --out ', here("Output/Temp/"), i, '2'))
# assign(paste0(i, 2), read_delim(paste0(here("Output/Temp/"), i, "2.bim"), delim = "\t", col_names = F))
# exclude_list <- get(paste0(i, 2)) %>% filter(str_detect(X2, ":"))
# #write_delim(exclude_list, file = paste0(here("Output/Temp/"), i, "2_exclude.txt"), delim = "\n", col_names = F)
# #system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), i, '2 --exclude ', here("Output/Temp/"), i, '2_exclude.txt --make-bed --out ', here("Output/Temp/"), i, '3'))
# }
# Running the conditional GWAS ----------------------------------------
# Split up the plink files to have one locus per file (saves on computational time)
# One extract-range file per locus; the commented parallel/plink calls below
# split each locus' SNP list into 10 chunks (suffixes 00-09) and run a
# logistic model conditioned on the locus lead SNP for each chunk.
gout_top2 <- gout_top %>%
select(CHR, BP1, BP2, RSID)
for(i in 1:nrow(gout_top2)){
tmp <- gout_top2 %>% slice(i)
#write_delim(tmp, file = paste0(here("Output/Temp/"), "extractrange_", tmp$RSID, ".txt"), delim = "\t", col_names = F)
}
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile {1}/Output/Temp/chr{2}_tmp --extract range {1}/Output/Temp/extractrange_{3}.txt --make-bed --out {1}/Output/Temp/{3}" ::: ', paste(rep(here(), nrow(gout_top)), collapse = " "), ' ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "cat {1}/Output/Temp/{2}.bim | cut -f 2 > {1}/Output/Temp/{2}_snps; split -d -n l/10 {1}/Output/Temp/{2}_snps {1}/Output/Temp/{2}_snps_split" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " ")))
# First round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {2} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition {2} --out {1}/Output/Temp/{2}_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
# Reading all the outputs into R
# NOTE(review): the regex dots in ".assoc.logistic" are unescaped (match any
# character); harmless for these file names but worth tightening
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# Re-assemble the 10 split outputs per locus into <rsid>_gwas and record each
# locus' most significant SNP after conditioning on its lead SNP.
# NOTE(review): assumes every locus produced exactly splits 00-09; a missing
# split file would stop the loop with a get() error.
tmp <- c()
for(i in 1:nrow(gout_top)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top$RSID[i], "_split0", j, ".assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top$RSID[i], "_gwas"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead = SNP, new_p = P)
# gout_top2 is reused here: now gout_top plus the conditioned lead and P-value
gout_top2 <- gout_top %>%
cbind(tmp)
# Loci whose best conditioned SNP is still genome-wide significant carry an
# independent second signal and go into the next round
gout_top_resid <- gout_top2 %>%
filter(new_p < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Second round of conditioning
# Same pattern as the first round, but only for loci with a residual signal
# (gout_top_resid), now conditioning on both the original lead SNP and the
# round-1 lead (written to <rsid>_2 by the commented commands).
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}'"), ' > {1}/Output/Temp/{2}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_2 --out {1}/Output/Temp/{2}_split{3}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_2.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# Re-assemble the 10 split outputs per locus into <rsid>_gwas2 and record the
# most significant SNP after conditioning on the two leads
tmp <- c()
for(i in 1:nrow(gout_top_resid)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid$RSID[i], "_split0", j, "_2.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid$RSID[i], "_gwas2"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead2 = SNP, new_p2 = P)
gout_top3 <- gout_top_resid %>%
cbind(tmp)
# Loci still genome-wide significant after two conditioning steps feed round 3
gout_top_resid2 <- gout_top3 %>%
filter(new_p2 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Third round of conditioning
# Same pattern again: condition on the original lead plus the round-1 and
# round-2 leads (condition list written to <rsid>_3 by the commented commands).
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}'"), ' > {1}/Output/Temp/{2}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_3 --out {1}/Output/Temp/{2}_split{3}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_3.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# Re-assemble the 10 split outputs per locus into <rsid>_gwas3 and record the
# most significant SNP after this round
tmp <- c()
for(i in 1:nrow(gout_top_resid2)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid2$RSID[i], "_split0", j, "_3.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid2$RSID[i], "_gwas3"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead3 = SNP, new_p3 = P)
gout_top4 <- gout_top_resid2 %>%
cbind(tmp)
# Loci still genome-wide significant after three conditioning steps feed round 4
gout_top_resid3 <- gout_top4 %>%
filter(new_p3 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Fourth round of conditioning
# Same pattern: condition on the original lead plus the round-1/2/3 leads
# (condition list written to <rsid>_4 by the commented commands).
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead3, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}\n{5}'"), ' > {1}/Output/Temp/{2}_4" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead2, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead3, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_4 --out {1}/Output/Temp/{2}_split{3}_4" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_4.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# Re-assemble the 10 split outputs per locus into <rsid>_gwas4 and record the
# most significant SNP after this round
tmp <- c()
for(i in 1:nrow(gout_top_resid3)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid3$RSID[i], "_split0", j, "_4.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid3$RSID[i], "_gwas4"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead4 = SNP, new_p4 = P)
gout_top5 <- gout_top_resid3 %>%
cbind(tmp)
# Loci still genome-wide significant after four conditioning steps feed round 5
gout_top_resid4 <- gout_top5 %>%
filter(new_p4 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Fifth round of conditioning
# Same pattern: condition on the original lead plus the round-1/2/3/4 leads
# (condition list written to <rsid>_5 by the commented commands).
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead4, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}\n{5}\n{6}'"), ' > {1}/Output/Temp/{2}_5" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead2, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead3, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead4, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_5 --out {1}/Output/Temp/{2}_split{3}_5" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_5.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# Re-assemble the 10 split outputs per locus into <rsid>_gwas5 and record the
# most significant SNP after this round
tmp <- c()
for(i in 1:nrow(gout_top_resid4)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid4$RSID[i], "_split0", j, "_5.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid4$RSID[i], "_gwas5"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead5 = SNP, new_p5 = P)
gout_top6 <- gout_top_resid4 %>%
cbind(tmp)
# Loci still genome-wide significant after five conditioning steps
gout_top_resid5 <- gout_top6 %>%
filter(new_p5 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Locus zooms ---------------------------------------
# Loading in the locus-zoom plotting function and the UCSC gene list used for
# the gene-annotation track (the variable is deliberately named after its source file)
source(here("Script/Functions/locus_zoom.R"))
UCSC_GRCh37_Genes_UniqueList.txt <- as.data.frame(read_delim(here("Data/GWAS/UCSC_GRCh37_Genes_UniqueList.txt"), delim = "\t"))
# Plotting locus zooms of original GWAS
# Calculating LD between each unconditioned lead SNP and all other SNPs on its
# chromosome (plink via GNU parallel; run once, kept commented out)
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))
# Reading the LD reports back into R, one object per lead SNP
# (named chr<CHR>_<RSID>_ld; looked up with get() in the plotting loops below)
for(i in 1:nrow(gout_top)){
assign(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld.ld")))
}
# Making the full list of SNPs for labelling: the original lead SNPs plus every
# new lead picked up in conditioning rounds 1-4, annotated with CHR/BP from the
# filtered summary stats so the list can be sorted genomically.
first_round <- tibble(RSID = gout_top_resid$new_lead)
second_round <- tibble(RSID = gout_top_resid2$new_lead2)
third_round <- tibble(RSID = gout_top_resid3$new_lead3)
fourth_round <- tibble(RSID = gout_top_resid4$new_lead4)
gout_top_full <- bind_rows(
  gout_top %>% select(RSID),
  first_round,
  second_round,
  third_round,
  fourth_round
) %>%
  left_join(biallelic_loci, by = "RSID") %>%
  arrange(CHR, BP)
# Plotting the locus zooms for the unconditioned GWAS: one plot per lead SNP,
# using all bi-allelic SNPs within the locus boundaries, coloured by LD with
# the lead SNP, with every lead/new-lead SNP labelled as a secondary SNP
for(i in 1:nrow(gout_top)){
locus.zoom(data = biallelic_loci %>% mutate(SNP = RSID) %>% filter(!is.na(SNP), CHR == gout_top$CHR[i] & between(BP, gout_top$BP1[i], gout_top$BP2[i])),
region = c(gout_top$CHR[i], gout_top$BP1[i], gout_top$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Unconditioned ", gout_top$RSID[i], " Locus Zoom"),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top$CHR[i], "_", gout_top$BP1[i], "_", gout_top$BP2[i], "_", gout_top$RSID[i], "_unconditioned", ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of first round of conditioning
# Remaking LD based on ALL new lead SNPs (plink run once, kept commented out)
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top2$CHR, collapse = " "), ' ::: ', paste(gout_top2$new_lead, collapse = " ")))
# Reading the LD reports back into R (one chr<CHR>_<new_lead>_ld object per locus)
for(i in 1:nrow(gout_top2)){
assign(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld.ld")))
}
# Plotting the round-1 conditioned GWAS; LD is shown relative to the new lead SNP.
# Fix: region previously indexed gout_top$CHR[i] instead of gout_top2$CHR[i] —
# the values coincide row-for-row here, but the mixed indexing was inconsistent
# with every other plotting loop and fragile if the two tables ever diverge.
for(i in 1:nrow(gout_top2)){
locus.zoom(data = get(paste0(gout_top2$RSID[i], "_gwas")),
region = c(gout_top2$CHR[i], gout_top2$BP1[i], gout_top2$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top2$RSID[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top2$CHR[i], "_", gout_top2$BP1[i], "_", gout_top2$BP2[i], "_", gout_top2$RSID[i], "_condition_", gout_top2$RSID[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of second round of conditioning
# Remaking LD based on new lead SNPs (plink run once, kept commented out)
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top3$CHR, collapse = " "), ' ::: ', paste(gout_top3$new_lead2, collapse = " ")))
# Reading the LD reports back into R (one chr<CHR>_<new_lead2>_ld object per locus)
for(i in 1:nrow(gout_top3)){
assign(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld.ld")))
}
# Plotting the GWAS conditioned on the original lead and the round-1 lead;
# LD is shown relative to the round-2 new lead SNP
for(i in 1:nrow(gout_top3)){
locus.zoom(data = get(paste0(gout_top3$RSID[i], "_gwas2")),
region = c(gout_top3$CHR[i], gout_top3$BP1[i], gout_top3$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top3$RSID[i], " and ", gout_top3$new_lead[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top3$CHR[i], "_", gout_top3$BP1[i], "_", gout_top3$BP2[i], "_", gout_top3$RSID[i], "_condition_", gout_top3$RSID[i], "and", gout_top3$new_lead[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of third round of conditioning
# Remaking LD based on the round-3 new lead SNP (plink run once, kept commented out)
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top4$CHR, collapse = " "), ' ::: ', paste(gout_top4$new_lead3, collapse = " ")))
# Reading the LD reports back into R (one chr<CHR>_<new_lead3>_ld object per locus)
for(i in 1:nrow(gout_top4)){
assign(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld.ld")))
}
# Plotting the GWAS conditioned on the original lead and the round-1/2 leads;
# LD is shown relative to the round-3 new lead SNP
for(i in 1:nrow(gout_top4)){
locus.zoom(data = get(paste0(gout_top4$RSID[i], "_gwas3")),
region = c(gout_top4$CHR[i], gout_top4$BP1[i], gout_top4$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top4$RSID[i], " and ", gout_top4$new_lead[i], " and ", gout_top4$new_lead2[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top4$CHR[i], "_", gout_top4$BP1[i], "_", gout_top4$BP2[i], "_", gout_top4$RSID[i], "_condition_", gout_top4$RSID[i], "and", gout_top4$new_lead[i], "and", gout_top4$new_lead2[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of fourth round of conditioning
# Remaking LD based on the round-4 new lead SNP (plink run once, kept commented out)
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top5$CHR, collapse = " "), ' ::: ', paste(gout_top5$new_lead4, collapse = " ")))
# Reading the LD reports back into R (one chr<CHR>_<new_lead4>_ld object per locus)
for(i in 1:nrow(gout_top5)){
assign(paste0("chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld.ld")))
}
# Plotting the GWAS conditioned on the original lead and the round-1/2/3 leads;
# LD is shown relative to the round-4 new lead SNP
for(i in 1:nrow(gout_top5)){
locus.zoom(data = get(paste0(gout_top5$RSID[i], "_gwas4")),
region = c(gout_top5$CHR[i], gout_top5$BP1[i], gout_top5$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top5$RSID[i], " and ", gout_top5$new_lead[i], " and ", gout_top5$new_lead2[i], " and ", gout_top5$new_lead3[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top5$CHR[i], "_", gout_top5$BP1[i], "_", gout_top5$BP2[i], "_", gout_top5$RSID[i], "_condition_", gout_top5$RSID[i], "and", gout_top5$new_lead[i], "and", gout_top5$new_lead2[i], "and", gout_top5$new_lead3[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
</full_update>
# Plotting locus zooms of fifth round of conditioning
# Remaking LD based on the round-5 new lead SNP (plink run once, kept commented out)
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top6$CHR, collapse = " "), ' ::: ', paste(gout_top6$new_lead5, collapse = " ")))
# Reading the LD reports back into R.
# Fix: the parallel call above computes LD for new_lead5, so read those files —
# the original read the round-4 (new_lead4) files, a copy-paste slip that showed
# LD relative to the wrong SNP.
for(i in 1:nrow(gout_top6)){
assign(paste0("chr", gout_top6$CHR[i], "_", gout_top6$new_lead5[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top6$CHR[i], "_", gout_top6$new_lead5[i], "_ld.ld")))
}
# Plotting the GWAS conditioned on the original lead and the round-1/2/3/4 leads;
# LD is shown relative to the round-5 new lead SNP.
# Fix: the title and file name now include new_lead4, which is part of the
# conditioning set in round 5 (the original reused the round-4 strings).
for(i in 1:nrow(gout_top6)){
locus.zoom(data = get(paste0(gout_top6$RSID[i], "_gwas5")),
region = c(gout_top6$CHR[i], gout_top6$BP1[i], gout_top6$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top6$CHR[i], "_", gout_top6$new_lead5[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top6$RSID[i], " and ", gout_top6$new_lead[i], " and ", gout_top6$new_lead2[i], " and ", gout_top6$new_lead3[i], " and ", gout_top6$new_lead4[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top6$CHR[i], "_", gout_top6$BP1[i], "_", gout_top6$BP2[i], "_", gout_top6$RSID[i], "_condition_", gout_top6$RSID[i], "and", gout_top6$new_lead[i], "and", gout_top6$new_lead2[i], "and", gout_top6$new_lead3[i], "and", gout_top6$new_lead4[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Combining all GWAS results together into final list ----------------------------------------------
# Tag every SNP in gout_top_full with the BP1/BP2 boundaries of the original
# locus it falls inside, preallocating a list of per-locus chunks.
regions <- gout_top %>%
  select(CHR, BP1, BP2)
out <- vector("list", nrow(regions))
for (i in seq_len(nrow(regions))) {
  tmp <- gout_top_full %>%
    filter(CHR == regions$CHR[i] & between(BP, regions$BP1[i], regions$BP2[i])) %>%
    mutate(BP1 = regions$BP1[i],
           BP2 = regions$BP2[i])
  out[[i]] <- tmp
}
gout_top_full <- bind_rows(out)
# For each of the loci with multiple SNPs, I need to test the association of each SNP after adjusting for all others at the locus
# So first lets pull out all SNPs from those loci (loci sharing a BP1 value
# appear more than once in gout_top_full)
multi_snps <- gout_top_full %>%
filter(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1]))
tmp <- multi_snps %>% select(SNP)
# Extract those SNPs into per-chromosome plink files and merge them
# (run once, kept commented out)
#write_delim(tmp, file = here("Output/Temp/snps_to_extract.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp3 --extract ', here("Output/Temp/snps_to_extract.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_tmp4" ::: ', paste(unique(multi_snps$CHR), collapse = " ")))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr4_tmp4 --bmerge ', here("Output/Temp/"), 'chr11_tmp4 --make-bed --out ', here("Output/Temp/"), 'merged_tmp4'))
# NOTE(review): the hard-coded index ranges below assume multi_snps has exactly
# 8 rows — rows 5:8 conditioned on a single SNP, rows 1:4 conditioned on the
# other three SNPs at their locus. Re-check if the set of loci changes.
for(i in 5:8){
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_tmp4 --logistic sex --freq case-control --geno 0.1 --missing --ci 0.95 --maf 0.0001 --hwe 0.000001 --hardy --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition ', multi_snps$SNP[i], ' --out ', here("Output/Temp/"), 'new_gwas_', multi_snps$SNP[i]))
}
for(i in 1:4){
#write_delim(multi_snps %>% slice(1:4) %>% select(SNP) %>% filter(SNP != multi_snps$SNP[i]), file = paste0(here("Output/Temp/"), "conditionlist_", multi_snps$SNP[i], ".txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_tmp4 --logistic sex --freq case-control --geno 0.1 --missing --ci 0.95 --maf 0.0001 --hwe 0.000001 --hardy --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition-list ', here("Output/Temp/"), 'conditionlist_', multi_snps$SNP[i], '.txt --out ', here("Output/Temp/"), 'new_gwas_', multi_snps$SNP[i]))
}
# Reading the per-SNP conditional GWAS results back in (additive-model rows only)
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "new_gwas_.+logistic")]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
# Rows 1-4 (conditioned on the other three locus SNPs): slice(i) picks the
# i-th SNP's own row from its results.
# NOTE(review): this positional slicing assumes a fixed SNP order in the plink
# output matching multi_snps — verify if anything upstream changes.
tmp <- c()
for(i in 1:4){
tmp2 <- get(paste0("new_gwas_", multi_snps$SNP[i], ".assoc.logistic")) %>% slice(i)
tmp <- rbind(tmp, tmp2)
}
# Rows 5-8 (conditioned on one SNP): drop the first four rows (the other
# locus), then stack what remains
tmp2 <- c()
for(i in 5:8){
tmp3 <- get(paste0("new_gwas_", multi_snps$SNP[i], ".assoc.logistic")) %>% slice(-(1:4))
tmp2 <- rbind(tmp2, tmp3)
}
# NOTE(review): slice(5, 2, 15, 12) hand-picks each conditioned SNP's own row
# from the stacked results — these magic indices must be re-derived if the SNP
# set or row order changes.
tmp2 <- tmp2 %>% slice(5, 2, 15, 12)
multi_snps2 <- rbind(tmp, tmp2)
# Merge the conditional effect estimates back onto the SNP annotation,
# keyed on chromosome, SNP ID, and position
tmp <- multi_snps %>%
select(CHR:INFO, BP1, BP2)
tmp2 <- multi_snps2 %>%
select(CHR:BP, OR:U95, P)
multi_snps3 <- left_join(tmp, tmp2, by = c("CHR", "SNP", "BP")) %>%
select(CHR:INFO, OR:P, BP1:BP2)
# LD among the multi-SNP locus variants (plink run once, kept commented out)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_tmp4 --geno 0.1 --missing --maf 0.0001 --hwe 0.000001 --hardy --snps-only --r2 inter-chr --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_ld'))
merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_ld.ld"))
# Single-SNP loci keep their original (unconditioned) estimates
single_snps <- gout_top_full %>%
filter(!(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1])))
gout_top_final <- rbind(multi_snps3, single_snps) %>%
arrange(CHR, BP)
# Flipping allele order + OR etc so effect allele is always the gout risk allele + labelling based on locus zooms
# SNPs with OR < 1: invert OR and the CI (the bounds swap when inverted) and
# flip the effect/alternate alleles and the effect-allele frequency
smallOR <- gout_top_final %>%
filter(OR < 1) %>%
mutate(OR = as.numeric(signif(1/OR, digits = 4)),
tmp_L = as.numeric(signif(1/L95, digits = 4)),
tmp_U = as.numeric(signif(1/U95, digits = 4)),
U95 = tmp_L,
L95 = tmp_U,
EAF = 1 - EAF) %>%
rename(allele2 = Effect_Allele,
allele1 = Alternate_Allele) %>%
rename(Alternate_Allele = allele2,
Effect_Allele = allele1) %>%
select(CHR:BP, Effect_Allele, Alternate_Allele, EAF:BP2)
# Fix: >= 1 (the original used > 1) so a SNP with OR exactly 1 is not silently
# dropped — that would also desynchronise the positional Locus_Name vector below
bigOR <- gout_top_final %>%
filter(OR >= 1)
# Recombine (full_join with no shared rows is effectively a row-bind here),
# sort genomically, and label each SNP with its locus name.
# NOTE(review): Locus_Name is assigned positionally and assumes exactly 26 rows
# in this CHR/BP order — re-check if the loci change.
gout_top_final <- full_join(smallOR, bigOR) %>%
arrange(CHR, BP) %>%
select(-SE) %>%
mutate(Locus_Name = c("ARID1A", "PDZK1", "TRIM46", "GCKR", "SFMBT1", "SLC2A9", "SLC2A9", "SLC2A9", "SLC2A9", "ABCG2", "ABCG2", "ADH1B", "TMEM171", "RREB1", "SLC17A1", "MLXIPL", "SLC16A9", "ANO3", "SLC22A11", "SLC22A11", "OVOL1", "R3HDM2", "MLXIP", "NFAT5", "INSR", "PNPLA3"))
UKBB_Gene_OR <- gout_top_final
save(UKBB_Gene_OR, file = here("Output/UKBB_Gene_OR.RData"))
# Cleaning up intermediate objects from the European conditional analysis
rm(list = ls()[str_detect(ls(), "^chr|^gout_top|^new_")], bigOR, duped, exclude_list, first_round, multi_snps, multi_snps2, multi_snps3, notduped, out, regions, second_round, single_snps, smallOR, test, test2, test3, third_round, tmp, tmp2, tmp3, file_names, i)
# Load in SNPs for inclusion in Polynesian analysis
load(here("Output/Temp/biallelic_sumstat_final_poly.RData"))
load(here("Output/UKBB_Gene_OR.RData"))
# Next I need to test the 26 lead SNPs from the European analysis for LD with this list of 203,787 variants in order to find the best proxies
# To make this more efficient, I should initially filter the 200,000 SNP list based on the locus boundaries for the Euro PRS
biallelic_loci_poly <- tibble()
tmp_loci <- UKBB_Gene_OR %>%
select(CHR, BP1, BP2) %>%
unique()
# Keep only Polynesian summary-stat SNPs that fall inside a European locus
for(i in 1:nrow(tmp_loci)){
tmp <- biallelic_sumstat_final_poly %>%
filter(CHR == tmp_loci$CHR[i] & between(BP, tmp_loci$BP1[i], tmp_loci$BP2[i]))
biallelic_loci_poly <- rbind(biallelic_loci_poly, tmp)
}
# For each European lead SNP, look for an exact CHR/BP match in the Polynesian
# locus data (out collects the 0-or-1 matching row per SNP)
out <- c()
for(i in 1:nrow(UKBB_Gene_OR)){
test <- biallelic_loci_poly %>%
filter(CHR == UKBB_Gene_OR$CHR[i] & BP == UKBB_Gene_OR$BP[i])
out <- rbind(out, test)
}
# Lead SNPs with no exact match need an LD proxy.
# Fix: the original filter(BP != out$BP) compared two different-length vectors
# element-wise (recycling out$BP), giving wrong results whenever any SNP lacked
# a match; the intended test is set membership on the CHR/BP pair.
need_proxies <- UKBB_Gene_OR %>%
filter(!(paste(CHR, BP) %in% paste(out$CHR, out$BP)))
#write_delim(need_proxies %>% select(SNP), file = here("Output/Temp/need_proxies.txt"), delim = "\t", col_names = F)
# Restrict the Polynesian summary stats to the loci that still need a proxy,
# then build a CHR/BP/BP/SNP table for plink's --extract range.
tmp_loci <- need_proxies %>%
  distinct(CHR, BP1, BP2)
biallelic_loci_poly2 <- tibble()
for (i in seq_len(nrow(tmp_loci))) {
  in_locus <- biallelic_sumstat_final_poly %>%
    filter(CHR == tmp_loci$CHR[i] & between(BP, tmp_loci$BP1[i], tmp_loci$BP2[i]))
  biallelic_loci_poly2 <- rbind(biallelic_loci_poly2, in_locus)
}
# Single-position ranges (BP1 == BP) for every candidate proxy SNP plus the
# lead SNPs themselves, sorted genomically
tmp <- biallelic_loci_poly2 %>%
  transmute(CHR, BP, BP1 = BP, SNP)
tmp2 <- need_proxies %>%
  transmute(CHR, BP, BP1 = BP, SNP)
need_ld <- bind_rows(tmp, tmp2) %>%
  arrange(CHR, BP)
# write_delim(need_ld, file = here("Output/Temp/need_ld.txt"), delim = "\t", col_names = F)
# Now I need to make a single UK Biobank plink file with all SNPs above and test LD between the 25 SNPs of interest and all others
# For each SNP, I will then pull out the best proxy and filter the final list for proxies with at least 0.8 R-squared with the lead SNP - either that or based on whether they associate with gout
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp3 --extract range ', here("Output/Temp/need_ld.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_poly" ::: ', paste(unique(need_ld$CHR), collapse = " ")))
#write_delim(as_tibble(paste0(here("Output/Temp/"), "chr", unique(need_ld$CHR), "_poly")), file = here("Output/Temp/mergefile.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile.txt --make-bed --out ', here("Output/Temp/"), 'merged_poly'))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly --geno 0.1 --missing --maf 0.0001 --hwe 0.000001 --hardy --snps-only --r2 inter-chr --ld-snp-list ', here("Output/Temp/"), 'need_proxies.txt --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_poly_ld'))
merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_poly_ld.ld"))
# For each SNP needing a proxy, keep the single highest-R2 partner
# (excluding the SNP itself)
out <- tibble()
for(i in need_proxies$SNP){
tmp <- merged_ld %>% filter(SNP_A == i & SNP_B != i) %>% arrange(desc(R2)) %>% slice(1)
out <- rbind(out, tmp)
}
# Order the saved locus-zoom plots by chromosome, position, and conditioning
# round for display. The "Expected 7 pieces" warning below is expected: the
# unconditioned plot file names have fewer underscore-separated parts.
file_names <- list.files(here("Output/Plots"), full.names = T)
tmp <- file_names %>% as_tibble() %>% separate(value, sep = "_", into = c(NA, "X2", "BP1", NA, NA, "Cond", "CondSNPs")) %>%
rownames_to_column() %>% mutate(Cond1 = case_when(Cond == "unconditioned.jpg" ~ FALSE, TRUE ~ TRUE), BP1 = as.numeric(BP1), rowname = as.numeric(rowname)) %>% separate(X2, sep = "/", into = c(NA, NA, NA, NA, NA, NA, NA, "CHR")) %>% mutate(CHR = as.numeric(str_replace(CHR, "Chr", ""))) %>% arrange(CHR, BP1, Cond1, CondSNPs) %>% pull(rowname)
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 21 rows [2, 4, 6,
## 8, 10, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 34, 39, 41, 43, 45, ...].
file_names2 <- file_names[tmp]
include_graphics(file_names2)